In [1]:
from bs4 import BeautifulSoup
import requests
import arrow
def get_users(url="http://vidstatsx.com/youtube-top-200-most-subscribed-channels", timeout=30):
    """Get the users from a VidStatsX page.

    Downloads ``url`` and collects the ``id`` attribute of every ``<td>``
    element that carries one.

    Args:
        url: Address of the VidStatsX chart page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        A list of the ``id`` attribute values, in document order.
        (Presumably these are YouTube usernames; verify against the page markup.)

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently scraping an error page
    # (which would just yield an empty or misleading list).
    r.raise_for_status()
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(r.text, "html.parser")
    cell_ids = (cell.get('id') for cell in soup.find_all("td"))
    return [cell_id for cell_id in cell_ids if cell_id is not None]
Now that we have a function to get the users, we can ask YouTube for information about them, including the start dates. From there, we can convert those dates into ages using a third function.
In [2]:
def get_start_dates(users):
    """Look up when each user's channel was published, via the YouTube Data API.

    Sends one request per username to the v3 ``channels`` endpoint
    (``part=snippet``, ``forUsername=<user>``) and reads ``publishedAt`` from
    the first returned item.

    Args:
        users: Iterable of YouTube usernames (strings).

    Returns:
        A list of ``publishedAt`` values, in the order of ``users``. Users for
        which the response contains no items are skipped, so the result may be
        shorter than ``users``. An entry is ``None`` if the snippet has no
        ``publishedAt`` field.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    import os  # local import: only needed for the optional key override below

    request_url = "https://www.googleapis.com/youtube/v3/channels"
    # NOTE(review): a key committed to a notebook is effectively public.
    # Set YOUTUBE_API_KEY in the environment to override it, and rotate the
    # fallback below. It is kept only so existing runs behave as before.
    key = os.environ.get("YOUTUBE_API_KEY", "AIzaSyCZx95H8pP-csC_6G8mF5tv-kW_U20HJKs")

    start_dates = []
    for user in users:
        # Passing params lets requests URL-encode the username instead of
        # splicing it raw into the query string.
        response = requests.get(
            request_url,
            params={"part": "snippet", "forUsername": user, "key": key},
        )
        # Surface API errors as an HTTPError rather than a KeyError further down.
        response.raise_for_status()
        # Parse the body once; a missing 'items' key is treated as "no match".
        items = response.json().get('items', [])
        if len(items) > 0:
            start_dates.append(items[0]['snippet'].get('publishedAt'))
    return start_dates
def get_ages(users):
    """Return the age, in whole days, of each user's channel.

    Args:
        users: Iterable of YouTube usernames, passed on to ``get_start_dates``.

    Returns:
        A list of ints: the number of days between each start date returned by
        ``get_start_dates`` and the current time. It has one entry per start
        date, which may be fewer than ``users``.
    """
    start_dates = get_start_dates(users)
    # Take a single reference instant so every age is measured against the
    # same "now", instead of re-reading the clock for each channel.
    now = arrow.now()
    return [int((now - arrow.get(start_date)).days) for start_date in start_dates]
With all of our functions written, we can use them to find the ages of the top 1000 channels.
Note that after the top 200, the pages start where the last one left off, so the top 500 most subscribed channels page includes only channels from 201 to 500.
In [3]:
# Chart pages that follow the default top-200 page, in ranking order.
later_chart_urls = (
    "http://vidstatsx.com/youtube-top-500-most-subscribed-channels",
    "http://vidstatsx.com/youtube-top-750-most-subscribed-channels",
    "http://vidstatsx.com/youtube-top-1000-most-subscribed-channels",
)

# Start from the default page, then append each later page's users in order.
top_users = get_users()
for chart_url in later_chart_urls:
    top_users = top_users + get_users(chart_url)

# Ages are looked up once, for the combined list.
top_users_ages = get_ages(top_users)
VidStatsX also includes charts by category, so we can get the results by category, too.
In [4]:
# Education chart: scrape the listed users, then look up their channel ages.
edu_users = get_users("http://vidstatsx.com/youtube-top-100-most-subscribed-education-channels")
edu_ages = get_ages(edu_users)

# Gaming chart: same two steps.
gaming_users = get_users("http://vidstatsx.com/youtube-top-100-most-subscribed-games-gaming-channels")
gaming_ages = get_ages(gaming_users)
In [5]:
url_prefix = "http://vidstatsx.com/youtube-top-100-most-subscribed-"

# (display name, URL suffix) for each per-category chart, in fetch order.
category_pages = (
    ("Educational", "education-channels"),
    ("Games and Gaming", "games-gaming-channels"),
    ("Autos and Vehicles", "autos-vehicles-channels"),
    ("Comedy", "comedy-channels"),
    ("Entertainment", "entertainment-channels"),
    ("Film and Animation", "film-animation-channels"),
    ("How To and Style", "how-to-style-channels"),
    ("Music", "music-channels"),
    ("News and Politics", "news-politics-channels"),
    ("Nonprofit and Activism", "nonprofit-activism-channels"),
    ("People and Vlogs", "people-vlogs-channels"),
    ("Pets and Animals", "pets-animals-channels"),
    ("Science and Tech", "science-tech-channels"),
    ("Shows", "shows-channels"),
    ("Sports", "sports-channels"),
    ("Travel and Events", "travel-events-channels"),
)

# Map each display name to the list of channel ages found for that chart.
categories = {}
for category_label, page_suffix in category_pages:
    categories[category_label] = get_ages(get_users(url_prefix + page_suffix))
In [6]:
# Show how many channel ages were collected per category, as
# (count, category name) pairs. .items() is used because .iteritems()
# exists only on Python 2 dicts; .items() works on both 2 and 3.
[(len(ages), name) for name, ages in categories.items()]
Out[6]:
The average channel in the top 1000 channels by most subscribers is about six years old.
(Note: VidStatsX treats YouTube's automatically generated channels, like #Music, as real channels, and many of them have enough subscribers to be in the top 1000. Since the YouTube API (and most people, probably) does not consider them real channels, they're not included. Without these channels, there are only 964 channels in the dataset.)
In [7]:
def median(l):
    """Return the median of the numbers in ``l`` as a float.

    Args:
        l: Iterable of numbers. It is not modified; a sorted copy is used.

    Returns:
        The middle value for an odd number of items, or the mean of the two
        middle values for an even number of items, always as a float.

    Raises:
        ValueError: If ``l`` is empty.
    """
    ordered = sorted(l)  # sorted copy; the caller's list is left untouched
    count = len(ordered)
    if count == 0:
        raise ValueError("median() requires at least one value")
    # Floor division keeps the index an int on Python 3, where `/` on two
    # ints yields a float and cannot be used as a list index.
    middle = count // 2
    if count % 2 == 1:  # odd number of items: the single middle one
        return float(ordered[middle])
    # even number of items: average the two middle values
    return float(ordered[middle] + ordered[middle - 1]) / 2
# Mean and median channel age, in days, over top_users_ages.
# float() forces true division: on Python 2, int / int would silently
# truncate the mean to a whole number of days.
average_age = float(sum(top_users_ages)) / len(top_users_ages)
median_age = median(top_users_ages)
print("Average (days): " + str(average_age) + "; Median: " + str(median_age))
# Dividing by 365.0 gives approximate years (leap days are ignored).
print("Average (years): " + str(average_age/365.0) + "; Median: " + str(median_age/365.0))
print("Number of channels: " + str(len(top_users_ages)))
What about different types of channel?
Of the categories discussed on Cortex, Educational channels are slightly older than the average channel, by about six months. Gaming channels are on average only slightly younger, but the median gaming channel is much younger than the median channel, by about seven months.
In [8]:
# Mean and median age of the education and gaming channels, reported in
# approximate years (days / 365.0).
# float() forces true division: on Python 2, int / int would silently
# truncate each mean to a whole number of days.
edu_average_age = float(sum(edu_ages)) / len(edu_ages)
edu_median_age = median(edu_ages)
print("Educational Average: " + str(edu_average_age/365.0) + "; Median: " + str(edu_median_age/365.0))

gaming_average_age = float(sum(gaming_ages)) / len(gaming_ages)
gaming_median_age = median(gaming_ages)
print("Gaming Average: " + str(gaming_average_age/365.0) + "; Median: " + str(gaming_median_age/365.0))
Data for the other categories listed on VidStatsX:
In [ ]:
# Print the mean and median channel age, in approximate years, per category.
# .items() replaces the Python-2-only .iteritems() and works on both versions.
for category_name, category_ages in categories.items():
    if not category_ages:
        # No ages were collected for this category; skip it instead of
        # dividing by zero and aborting the remaining categories.
        print(category_name + "\tno channels found")
        continue
    # Loop-local names, so the notebook-level average_age / median_age
    # computed for the overall sample are not overwritten.
    # float() forces true division (Python 2 int / int would truncate).
    category_average_age = float(sum(category_ages)) / len(category_ages)
    category_median_age = median(category_ages)
    print(category_name + "\tAverage:\t" + str(category_average_age/365.0) + "\tMedian:\t" + str(category_median_age/365.0))